#Import and Clean Wave 1 Data
# Start from a clean session: drop every object in the global environment,
# then detach (and unload) any non-base packages that are currently attached.
rm(list = ls())
if (length(sessionInfo()$otherPkgs) > 0) {
  invisible(
    lapply(
      paste0("package:", names(sessionInfo()$otherPkgs)),
      function(pkg) detach(pkg, character.only = TRUE, unload = TRUE)
    )
  )
}

# Make sure pacman is available, installing it on first use, then attach it.
# The package name must be passed to install.packages() as a string; the bare
# symbol `pacman` is an undefined object and would error exactly when the
# package is missing.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

p_load(dplyr,
       stringr,
       readr,
       stringi,
       haven,
       matrixStats)
# Read the wave 1 survey export (Stata .dta) via haven.
alldat <- read_dta("../Data/wave1.dta")

# Codebook: one row per column of `alldat` with its variable label
# (NA when the column carries no "label" attribute).
# attr(..., exact = TRUE) is used on purpose: `attributes(x)$label` partially
# matches the "labels" (value labels) attribute when no "label" attribute
# exists, which would put a value code into the codebook instead of NA.
labels <- data.frame(
  varname = colnames(alldat),
  label = vapply(
    alldat,
    function(x) {
      lab <- attr(x, "label", exact = TRUE)
      if (is.null(lab)) NA_character_ else as.character(lab)[1]
    },
    character(1)
  )
)

# Recode the value 99 to missing.
# NOTE(review): this is applied to every cell of every column, not only to
# items where 99 is a "don't know"/missing code - confirm no column can
# legitimately hold 99.
alldat[alldat==99] <- NA
# Min-max rescale `x` onto [0, 1] using the observed range of `ref`
# (missing values in `ref` are ignored when finding the range).
rescale01 <- function(x, ref) {
  lo <- min(ref, na.rm = TRUE)
  hi <- max(ref, na.rm = TRUE)
  (x - lo) / (hi - lo)
}

# Build the analysis sample:
#   1. filter() keeps respondents who pass the attention / duration / consent /
#      completion checks and whose named territory is not blank, a lone "?" or
#      "-", or their own country;
#   2. select() keeps the variables used downstream;
#   3. rename() gives survey items readable names;
#   4. mutate() derives scaled indices, vote dummies, weights, region labels
#      and the party-affinity composites;
#   5. the raw "Pol*" items are dropped and rows without a region are removed.
dta <- alldat %>%
  filter(
    Attn_check == 4,
    Duration__in_seconds_ >= 800,
    Consent_Intro == 1 | Consent_Intro_EU == 1,
    Finished == 1,
    !(territ_loss_concern == 50 & Pol_15 < 3),
    # The last Hungarian alternatives previously read `magyarorsz?gon` and
    # `magyarotsz?g`: a garbled accented character that made `z` optional and
    # could never match the intended spellings (compare the Hungarian coding
    # further down the file). `.` matches any character in that position.
    # NOTE(review): this pattern is applied to the raw (not lower-cased)
    # answer, unlike the per-country coding later on - confirm that is intended.
    !str_detect(Pol_20a,'\\bde.tschl.nd\\b|(?<!.)\\?|^$|(?<!.)-|\\bmagyarorsz.g\\b|maagyarorsz.g$|hungary$|magyar$|magyarororszag$|magyarorsz.gon|magyarotsz.g|t.rk.ye$|.stanbul|turkey')
  ) %>%
  dplyr::select(#Weights
    ResponseId,
    starts_with("wt"),
    #Demographics
    male, age, educ_univ_complete,educ_sec_techvoc,educ_univ_any,educ_sec_univprep, rural, city, town, village, contains("homelang"),
    #Geocoding
    starts_with("County"), Bundesland_Germ, Province_Turk, hun, rom, ger, tur, Country,
    #Experimental treatments
    starts_with("version"),
    #Outcomes
    territ_loss_concern, contains("hist"),
    #political parties
    starts_with("Pol_24"),
    ends_with("_ind"),
    neigh_lgbt,
    relig_import01,
    pol_int01,
    Pol_20a,
    Pol_15,
    starts_with("Pol_21"),
    starts_with("Pol_7"),
    starts_with("Pol_8"),
    starts_with("Pol_16"),
    starts_with("Pol_22"),
    starts_with("Demo_9"),
    starts_with("like")
  ) %>%
  rename(
    namedterritory = Pol_20a,
    autho = Pol_16_1,
    technoc = Pol_16_2,
    democ = Pol_16_3,
    pol_int = Pol_15,
    territorial = version_12,
    territorial_prime = version_13,
    cultid = Demo_9_1,
    pplid = Demo_9_2,
    statid = Demo_9_3
  ) %>%
  mutate(
    # Indices rescaled to [0, 1] against the range observed in `alldat`,
    # i.e. the sample before the filter above was applied.
    covid_civlib_restrict_ind = rescale01(covid_civlib_restrict_ind, alldat$covid_civlib_restrict_ind),
    econ_intervention_ind = rescale01(econ_intervention_ind, alldat$econ_intervention_ind),
    pro_global_ind = rescale01(pro_global_ind, alldat$pro_global_ind),
    democ_ind = rescale01(democ_ind, alldat$democ_ind),
    natid_ind = rescale01(natid_ind, alldat$natid_ind),
    neigh_lgbt = rescale01(neigh_lgbt, alldat$neigh_lgbt),
    relig_import01 = rescale01(relig_import01, alldat$relig_import01),
    pol_int01 = rescale01(pol_int01, alldat$pol_int01),
    territ_loss_concern_scaled = rescale01(territ_loss_concern, alldat$territ_loss_concern),
    # Turnout and vote-choice dummies, one item per country
    voted = coalesce(Pol_8H, Pol_8R, Pol_8G, Pol_8T),
    like_csu = if_else(Bundesland_Germ == 4, like_cducsu, NA),
    voted_iohannis = as.numeric(Pol_8aR == 1),
    voted_dancila = as.numeric(Pol_8aR == 2),
    voted_barna = as.numeric(Pol_8aR == 3),
    voted_diaconu = as.numeric(Pol_8aR == 4),
    voted_paleologu = as.numeric(Pol_8aR == 5),
    voted_fidesz = as.numeric(Pol_8aH == 1),
    voted_jobbik = as.numeric(Pol_8aH == 2),
    voted_nonntl_hun = as.numeric(Pol_8aH == 3 | Pol_8aH == 4 | Pol_8aH == 5 | Pol_8aH == 6 | Pol_8aH == 7),
    voted_akp = as.numeric(Pol_8aT == 1),
    voted_mhp = as.numeric(Pol_8aT == 2),
    voted_chp = as.numeric(Pol_8aT == 3),
    voted_iyi = as.numeric(Pol_8aT == 4),
    voted_hdp = as.numeric(Pol_8aT == 5),
    voted_cdu = as.numeric(Pol_8aG == 1),
    voted_spd = as.numeric(Pol_8aG == 2),
    voted_grune = as.numeric(Pol_8aG == 4),
    voted_afd = as.numeric(Pol_8aG == 6),
    # NOTE(review): as.numeric(as.factor(<logical>)) yields 1 (Version == 3)
    # and 2 (Version != 3), not 0/1 - confirm this coding is what is wanted.
    D = as.numeric(as.factor(Version != 3)),
    # NOTE(review): the value codes below do not obviously line up with the
    # variable names - check against the codebook.
    romhist_post2007 = as.numeric(Pol_22R == 1998.5),
    gerhist_post2009 = as.numeric(Pol_22G == 1998.5),
    gerhist_reich = as.numeric(Pol_22G %in% c(2007.5, 2009.5)),
    # Keep each country weight only for that country's respondents, then merge
    wt_multip_w1_GE = ifelse(ger, wt_multip_w1_GE, NA),
    wt_multip_w1_HU = ifelse(hun, wt_multip_w1_HU, NA),
    wt_multip_w1_RO = ifelse(rom, wt_multip_w1_RO, NA),
    wt_multip_w1_TR = ifelse(tur, wt_multip_w1_TR, NA),
    hist = as.factor(coalesce(Pol_22H, Pol_22R, Pol_22T, Pol_22G)),
    wt = coalesce(wt_multip_w1_GE, wt_multip_w1_HU, wt_multip_w1_RO, wt_multip_w1_TR),
    approvegovt = coalesce(Pol_7G, Pol_7R, Pol_7H, Pol_7T),
    culpro = coalesce(Pol_21H_1, Pol_21R_1, Pol_21T_1),
    citiz = coalesce(Pol_21H_2, Pol_21R_2, Pol_21T_2),
    diplo = coalesce(Pol_21H_3, Pol_21R_3, Pol_21T_3),
    sanct = coalesce(Pol_21H_4, Pol_21R_4, Pol_21T_4),
    milit = coalesce(Pol_21H_5, Pol_21R_5, Pol_21T_5),
    # Per-respondent sum of the five items, ignoring missing items.
    # (sum(c(...)) inside mutate() returned one grand total over all rows,
    # recycled to every respondent.) Rows with no item answered stay NA
    # instead of becoming 0.
    diasscale = rowSums(cbind(as.numeric(culpro), as.numeric(citiz), as.numeric(diplo),
                              as.numeric(sanct), as.numeric(milit)), na.rm = TRUE),
    diasscale = replace(diasscale,
                        is.na(culpro) & is.na(citiz) & is.na(diplo) & is.na(sanct) & is.na(milit),
                        NA),
    educ_univ = educ_univ_complete | educ_univ_any,
    hist_greatestextent = coalesce(romhist_1938, hunhist_pre18, turhist_pre1922, gerhist_reich),
    hist_present = coalesce(romhist_post2007, hunhist_post2010, turhist_post2002, gerhist_post2009),
    # Region name: look up each code's text in the column's "labels" attribute
    ch = names(attr(County_Hung, "labels"))[match(County_Hung, attr(County_Hung, "labels"))],
    cr = names(attr(County_Rom, "labels"))[match(County_Rom, attr(County_Rom, "labels"))],
    ct = names(attr(Province_Turk, "labels"))[match(Province_Turk, attr(Province_Turk, "labels"))],
    cg = names(attr(Bundesland_Germ, "labels"))[match(Bundesland_Germ, attr(Bundesland_Germ, "labels"))],
    region = as.factor(coalesce(cr, ch, ct, cg)),
    County_Hung = as.factor(County_Hung),
    County_Rom = as.factor(County_Rom),
    Bundesland_Germ = as.factor(Bundesland_Germ),
    Province_Turk = as.factor(Province_Turk),
    Country = as.factor(Country),
    # Party-affinity composites. `.` is the data frame entering mutate().
    # rowMaxs(..., na.rm = TRUE) returns -Inf for rows with no rating at all,
    # hence the replace(..., == -Inf, NA) that follows each maximum.
    populist_like_hun = rowMeans(select(., like_jobbik, like_fidesz), na.rm = TRUE),
    nationalist_like_hun = rowMeans(select(., like_jobbik, like_fidesz), na.rm = TRUE),
    populist_like_hun_max = rowMaxs(as.matrix(select(., like_jobbik, like_fidesz)), na.rm = TRUE),
    populist_like_hun_max = replace(populist_like_hun_max, populist_like_hun_max == -Inf, NA),
    nationalist_like_hun_max = rowMaxs(as.matrix(select(., like_jobbik, like_fidesz)), na.rm = TRUE),
    nationalist_like_hun_max = replace(nationalist_like_hun_max, nationalist_like_hun_max == -Inf, NA),
    populist_like_rom = like_prm,
    populist_like_rom_rob = rowMeans(select(., like_prm, like_pmp), na.rm = TRUE),
    populist_like_rom_max = rowMaxs(as.matrix(select(., like_prm, like_pmp)), na.rm = TRUE),
    populist_like_rom_max = replace(populist_like_rom_max, populist_like_rom_max == -Inf, NA),
    nationalist_like_rom_max = rowMaxs(as.matrix(select(., like_prm, like_pmp, like_psd)), na.rm = TRUE),
    nationalist_like_rom_max = replace(nationalist_like_rom_max, nationalist_like_rom_max == -Inf, NA),
    populist_like_tur = rowMeans(select(., like_akp, like_mhp), na.rm = TRUE),
    populist_like_tur_max = rowMaxs(as.matrix(select(., like_akp, like_mhp)), na.rm = TRUE),
    populist_like_tur_max = replace(populist_like_tur_max, populist_like_tur_max == -Inf, NA),
    kemalist_like = rowMeans(select(., like_chp, like_iyi), na.rm = TRUE),
    kemalist_like_max = rowMaxs(as.matrix(select(., like_chp, like_iyi)), na.rm = TRUE),
    kemalist_like_max = replace(kemalist_like_max, kemalist_like_max == -Inf, NA),
    nationalist_like_tur_max = rowMaxs(as.matrix(select(., like_akp, like_mhp, like_iyi)), na.rm = TRUE),
    nationalist_like_tur_max = replace(nationalist_like_tur_max, nationalist_like_tur_max == -Inf, NA),
    populist_like_ger = like_afd,
    # Bundesland 4 respondents get the higher of their CSU / AfD rating,
    # everyone else the AfD rating.
    nationalist_like_ger_max = case_when(
      (Bundesland_Germ == 4 & (like_csu > like_afd)) ~ like_csu,
      .default = like_afd
    ),
    nationalist_like_ger_max = replace(nationalist_like_ger_max, nationalist_like_ger_max == -Inf, NA),
    populist_only_ger = like_linke,
    # Cross-country versions: take whichever country-specific measure is present
    populist_like_all = coalesce(populist_like_hun,
                                 populist_like_rom,
                                 populist_like_tur,
                                 populist_like_ger),
    populist_like_all_onep = coalesce(like_fidesz, like_prm, like_akp, like_afd),
    populist_like_all_max = coalesce(populist_like_hun_max,
                                     populist_like_rom_max,
                                     populist_like_tur_max,
                                     populist_like_ger),
    nationalist_like_all_max = coalesce(nationalist_like_hun_max,
                                        nationalist_like_rom_max,
                                        nationalist_like_tur_max,
                                        nationalist_like_ger_max),
    nationalist_only_like_all = coalesce(like_csu,
                                         like_psd,
                                         like_iyi),
    populist_in_power = as.numeric(Country %in% c(76, 179)),
    populist_like_in_power = coalesce(like_fidesz, like_akp),
    populist_like_not_in_power = coalesce(like_jobbik, populist_like_rom_max, like_afd),
    populist_voted = coalesce(voted_fidesz, voted_paleologu, voted_iyi, voted_afd),
    national_language = coalesce(hun_homelang_hunonly, ger_homelang_geronly, rom_homelang_romonly, tur_homelang_turonly),
    territorial = as.factor(territorial)
  ) %>%
  # Drop the raw survey items (case-sensitive "Pol" prefix, so pol_int /
  # pol_int01 are kept) and respondents without a region.
  select(!starts_with("Pol", ignore.case = FALSE)) %>%
  filter(!is.na(region))
#Code lost territory statements
# Split the cleaned sample into one data frame per country using the Country
# code; each subset's free-text answers are coded separately below, starting
# with Germany.
losssurveygermany <- filter(dta, Country == 1358)
losssurveyromania <- filter(dta, Country == 142)
losssurveyturkey <- filter(dta, Country == 179)
losssurveyhungary <- filter(dta, Country == 76)
# Germany: classify the lower-cased free-text answers with one regex per
# category. A response can fall into several categories at once.
gt <- tolower(losssurveygermany$namedterritory)
# Don't know / refusal / nonsense answers, unless the answer mentions "preu"
gdk <- str_detect(
  gt,
  '(?<!.)-|kein|nicht|weiss|weiß|k\\.a\\.|ahnung|mir egal|(?<!.)\\.\\.\\.|umfrage|lol|k\\/a|don\'t|(?<!.)ka|idk|xx+|blödsinn|ßß+|bbv+|^$|(?<!.)k\\.|\\d|no idea|nein|(?<!.)\\?'
) &
  !str_detect(gt, 'preu')
#gt[gdk]
silesiapommerania <- str_detect(
  gt,
  'polen|schlrsien|sclesien|schlesien|schliesen|posen|danzig|pommern|ostgebiete|poland|neiße|(?<!ost)preu|pölen'
)
#Including these together since only a handful mentioned saarland, but arguably should be separate since that's no longer lost
alsacesaarland <- str_detect(
  gt,
  'elsas|elsäss|elsaß|alsas|alsace|lothringen|frankreic|saar|france|straßburg'
)
#Over a hundred people said something about Germany - some appear to be referring to the loss of east germany, one is just a copy-paste of the question (bot?), but overall these people appear not to have understood the question, at least not as it was intended
germany <- str_detect(gt, 'de.tschl.nd|deutschland(?!sudeten)|berlin|(?<!.)ddr')
sudetenland <- str_detect(gt, 'sudet|tschech')
austria <- str_detect(gt, 'österreich|austria')
russia <- str_detect(gt, 'russ|könig|kalin|ostpre|ruß')
# Answers matched by none of the patterns (printed for manual inspection)
gt[!(gdk + silesiapommerania + alsacesaarland + germany + sudetenland + russia + austria)]
# Store every category as a 0/1 column; `other` = 1 when no category matched
losssurveygermany <- losssurveygermany %>%
  mutate(
    dk = as.numeric(gdk),
    poland = as.numeric(silesiapommerania),
    france = as.numeric(alsacesaarland),
    germany = as.numeric(germany),
    sudetenland = as.numeric(sudetenland),
    austria = as.numeric(austria),
    russia = as.numeric(russia),
    other = as.numeric(
      !(gdk + silesiapommerania + alsacesaarland + germany + sudetenland + russia + austria)
    )
  )
#About 200 people who speak German said strange things, many of which seem to indicate either being from another country or having a very idiosyncratic grasp of history (e.g. Crimea, Syria, Palestine,Italy Spain)
# Cross-tabulate the German-only home-language flag against `other`
losssurveygermany %>%
  group_by(ger_homelang_geronly, other) %>%
  summarise(n = n())
#Romania
# Same approach as for the other countries: lower-case the answers, flag each
# category with a regex, then store the flags as 0/1 columns.
rt <- tolower(losssurveyromania$namedterritory)
rdk <- str_detect(rt, 'stiu|(?<!.)nu|amint|know|(?<!.)\\?|^$')
cadrilaterul <- str_detect(rt, '.adril|bulga|dobr|silist|balcic')
bucovina <- str_detect(rt, 'bu.ovin|u.rai|cern.u.i')
bessarabia <- str_detect(rt, 'basarab|basarbia|mold|arabia|chis|chi.in.u')
russiasovet <- str_detect(rt, 'urss|rus+i|uniun|soviet')
transylvaniaszekely <- str_detect(rt, 'rom.nia|trans.l|ardeal|erdely|erdély|s.+cuiesc')
hungary <- str_detect(rt, 'ungar|magyar')
snakeisland <- str_detect(rt, '(?<!pen)insula|serpilor')
italy <- str_detect(rt, 'itali')
# TRUE when none of the patterns above matched
other <- !(
  rdk + cadrilaterul + bucovina + russiasovet + bessarabia +
    transylvaniaszekely + hungary + snakeisland + italy
)
losssurveyromania <- losssurveyromania %>%
  mutate(
    dk = as.numeric(rdk),
    cadrilaterul = as.numeric(cadrilaterul),
    bucovina = as.numeric(bucovina),
    bessarabia = as.numeric(bessarabia),
    russiasoviet = as.numeric(russiasovet),
    transylvaniaszekely = as.numeric(transylvaniaszekely),
    hungary = as.numeric(hungary),
    snakeisland = as.numeric(snakeisland),
    italy = as.numeric(italy),
    other = as.numeric(other)
  )
#Hungary
# Lower-case the answers, flag each category with a regex, then store the
# flags as 0/1 columns. `austria`, `hungary` and `other` reuse (and overwrite)
# the global vectors of the same name created for the other countries.
ht <- tolower(losssurveyhungary$namedterritory)
hdk <- str_detect(ht, 'nem|tudom|kérdés|(?<!.)nem|(?<!.)\\?|^$')
romaniatransylvania <- str_detect(
  ht,
  'rom\\,ánia|rom.ni.|trans.l|erd.l|erdè|erdá.y|erdélly|erdèl.|érmellék|e.dély|szatmárnémeti'
)
transcarpathia <- str_detect(ht, 'k.rp.t|ukr.')
upperhungary <- str_detect(ht, 'szlov.kia|felvid.k|fel-vidék|felidék')
vojevodina <- str_detect(
  ht,
  'vajdas.g|b.ns.g|b.n.t|szerb|bácska|szabadka|jugoszlávia|d.l-vid.k|d.lvid.k'
)
szekelyland <- str_detect(ht, 'sz.k.l')
# NOTE(review): 'újvidék' / 'ujvidek' are matched here under austria rather
# than under vojevodina - confirm this is the intended category.
austria <- str_detect(
  ht,
  'ausztria|burge.land|újvidék|austria|isztria|.rvidék|.sterreich|bécs|ujvidek|borsmonostor'
)
hungary <- str_detect(
  ht,
  'magyarorsz.g$|maagyarorsz.g$|hungary$|magyar$|ungaria|magyarororszag$|magyarországon|magyarotszág'
)
croatia <- str_detect(ht, 'horv|croatia')
slovakia <- str_detect(ht, 'szlov.k|pozsony')
mentiontrianon <- str_detect(ht, 'tria.on|feldarabolása')
# TRUE when none of the patterns above matched
other <- !(
  hdk + romaniatransylvania + transcarpathia + upperhungary + vojevodina +
    szekelyland + austria + hungary + croatia + slovakia + mentiontrianon
)
losssurveyhungary <- losssurveyhungary %>%
  mutate(
    dk = as.numeric(hdk),
    romaniatransylvania = as.numeric(romaniatransylvania),
    transcarpathia = as.numeric(transcarpathia),
    upperhungary = as.numeric(upperhungary),
    vojevodina = as.numeric(vojevodina),
    szekelyland = as.numeric(szekelyland),
    austria = as.numeric(austria),
    hungary = as.numeric(hungary),
    croatia = as.numeric(croatia),
    slovakia = as.numeric(slovakia),
    mentiontrianon = as.numeric(mentiontrianon),
    other = as.numeric(other)
  )
# Share of Hungarian respondents mentioning each territory (printed)
losssurveyhungary %>%
  summarise(
    transylvania = sum(romaniatransylvania)/n(),
    transcarpathia = sum(transcarpathia)/n(),
    upperhungary = sum(upperhungary)/n(),
    vojevodina = sum(vojevodina)/n(),
    szekely = sum(szekelyland)/n(),
    austria = sum(austria)/n(),
    croatia = sum(croatia)/n(),
    slovakia = sum(slovakia)/n()
  )
#Turkey
# Lower-case the answers, flag each category with a regex, then store the
# flags as 0/1 columns. `.` stands in for letters that may or may not have
# been typed with their Turkish diacritics.
tt <- tolower(losssurveyturkey$namedterritory)
tdk <- str_detect(
  tt,
  '(?<!.)-+|(?<!.)xx+|bilmiy.r.m|yok|(?<!.)\\?|^$|\\d$|hat.rlam.yorum|bilmiyom|(?<!.)\\.+|nilmiyorum|gelmiyor|bilmyom|anlamadım|hatirlayamadim|bilmiom|hatirlamiyom|bilmiyoru'
)
greece <- str_detect(tt, 'y.n.n|ege|adalar|selanik|ada|greece|girit')
iraq <- str_detect(tt, 'musul|irak|.rak|kerk.k|ba.dat|kerk.t|iraq')
bulgaria <- str_detect(tt, 'bulga.i|trakya|bulgar.stan|bulgar')
otherbalkans <- str_detect(
  tt,
  'balkan|bosna|s.rb.stan|kosov|mak.donya|yugoslavya|slovakya|hırvatistan|arnavutluk|romanya|eflak|h.rvatistan|balksnla'
)
syria <- str_detect(tt, 'suriye|s.l.yman|s.r.ye|syria|şam')
caucasus <- str_detect(tt, 'ermeni|batum|az.rb.ycan|g.rcistan|flis')
turkey <- str_detect(tt, 't.rk.ye$|.stanbul|turkey')
franceengland <- str_detect(tt, 'fra.sa|.ngiltere')
cyprus <- str_detect(tt, 'k.br.s|kktc|kkk|kibriz|kipr')
russiacrimea <- str_detect(tt, 'rusya|k.r.m')
iran <- str_detect(tt, '.ran(?!sa)')
israel <- str_detect(tt, '.sra.l|filistin|kudüs')
austrohungary <- str_detect(tt, 'macari|avusturya|viyana')
americagermany <- str_detect(tt, 'amer.ka|almanya')
arabianorthafrica <- str_detect(
  tt,
  'm.s.r|arabistan|garp|libya|fas|tunus|kahire|mekke|trablusgarb|cezayir|yemen'
)
italy <- str_detect(tt, '.taly|vatikan')
spain <- str_detect(tt, '.spanya|.nd.l.s')
# TRUE when none of the patterns above matched
other <- !(
  tdk + greece + iraq + syria + bulgaria + otherbalkans + caucasus + turkey +
    franceengland + cyprus + russiacrimea + iran + israel + austrohungary +
    americagermany + arabianorthafrica + spain + italy
)
#tt[other]
losssurveyturkey <- losssurveyturkey %>%
  mutate(
    dk = as.numeric(tdk),
    greece = as.numeric(greece),
    iraq = as.numeric(iraq),
    bulgaria = as.numeric(bulgaria),
    otherbalkans = as.numeric(otherbalkans),
    syria = as.numeric(syria),
    caucasus = as.numeric(caucasus),
    turkey = as.numeric(turkey),
    franceengland = as.numeric(franceengland),
    cyprus = as.numeric(cyprus),
    russiacrimea = as.numeric(russiacrimea),
    iran = as.numeric(iran),
    israel = as.numeric(israel),
    austrohungary = as.numeric(austrohungary),
    americagermany = as.numeric(americagermany),
    arabianorthafrica = as.numeric(arabianorthafrica),
    italy = as.numeric(italy),
    spain = as.numeric(spain),
    other = as.numeric(other)
  )

# Stack the four country files again. Category columns that were only created
# for some countries are NA for the others; any warnings raised while binding
# are silenced.
dta <- suppressWarnings(
  bind_rows(losssurveygermany, losssurveyhungary, losssurveyromania, losssurveyturkey)
)

# Collapse the territory categories into two 0/1 indicators plus their overlap.
# Within each `|` chain a single non-zero flag gives 1; a chain that ends up
# NA (no flag set, some columns not defined for that country) is coded 0 via
# `missing = 0`.
# NOTE(review): `austria` is created by both the German and the Hungarian
# coding, so the stacked column mixes the two, and it is listed under both
# indicators below - any Austria mention is therefore counted in `both`.
# Confirm this is intended.
dta <- dta %>%
  mutate(
    territoryonly = if_else(
      (iraq |
         otherbalkans |
         syria |
         caucasus |
         franceengland |
         russiacrimea |
         israel |
         austrohungary |
         arabianorthafrica |
         cadrilaterul |
         snakeisland |
         sudetenland |
         austria |
         russia),
      true = 1, false = 0, missing = 0
    ),
    lostunity = if_else(
      (greece |
         bulgaria |
         cyprus |
         romaniatransylvania |
         transcarpathia |
         upperhungary |
         vojevodina |
         szekelyland |
         croatia |
         slovakia |
         mentiontrianon |
         bucovina |
         bessarabia |
         poland |
         france |
         austria),
      true = 1, false = 0, missing = 0
    ),
    both = as.numeric(territoryonly & lostunity)
  )

# Same data without the respondents assigned to Version 4
dtano4 <- filter(dta, Version != 4)
# Save the cleaned data: the full sample and the subset without Version 4

saveRDS(object = dta, file = "../Data/clean_data.RDS")
saveRDS(object = dtano4, file = "../Data/clean_data_noV4.RDS")

